<?php

namespace StudyBuddy\String;

/**
 * Class for manipulating html string
 * as DOMDocument
 *
 * The super class of this class is DOMDocument,
 * so it automatically has all the methods of php's
 * DOMDocument class, meaning also all methods of DOMNode class
 *
 *
 * This class also has some regex-based
 * manipulation methods.
 * All regex-based functions of this class
 * are UTF-8 safe, and the actual string
 * represented by this class is also
 * always guaranteed to be in the UTF-8 charset
 * because it takes a Utf8String object as input
 *
 * @Important always use HTMLStringParser::factory(Utf8string $s)
 * to instantiate this object!
 * Never instantiate with the new keyword!
 * 
 */
class HTMLStringParser extends HTMLString {

    /**
     * Regex alternation ("word1|word2|...") of the escaped
     * words to hilight. Built by hilightWords() and
     * consumed by hilight()
     *
     * @var string
     */
    protected $pattern;

    /**
     * Array of words to hilight in string,
     * stored exactly as it was passed to hilightWords()
     *
     * @var array
     */
    protected $aWords = array();

    /**
     * Name of the html tag that wraps a hilighted word
     *
     * @var string
     */
    protected $hlTag = 'em';

    /**
     * Class name given to every hilighting tag.
     * A specific class name is used so that these
     * nodes can easily be found later by unhilight()
     * in case we need to 'de-hilight' the string
     *
     * @var string
     */
    protected $hlClass = "wtag";

    /**
     * Counter of links created by the
     * linkify() method
     * It was meant to tell if we should reload()
     * the object after calling linkify()
     *
     * NOTE(review): nothing in this class updates this
     * counter - linkify() only sets $this->bCDATA.
     * Confirm whether the parent class reads it.
     *
     * @var int
     */
    protected $linkified = 0;

    /**
     * Array of (lower case) tag names that
     * should be excluded from all type
     * of text manipulation inside them
     * This is for the hilight() and
     * linkify() methods
     *
     * @var array
     */
    protected $aExcluded = array(
        'a',
        'br',
        'img',
        'hr',
        'script',
        'object',
        'head',
        'code',
        'pre',
        'textarea',
        'input',
        'button',
        'map',
        'select',
        'var',
        'base',
        'meta',
        'link',
        'style',
        'title');

    /**
     * Getter for the regex alternation built
     * by the last call to hilightWords()
     *
     * @return string|null null if hilightWords() has not
     * produced a pattern yet
     */
    protected function getPattern() {
        return $this->pattern;
    }

    /**
     * Wrap all words matching any of the
     * words in passed array inside the special tags
     * usually the <em></em> tag is used
     * These tags also have special classname, which
     * later could be used to find all elements that were
     * added as a result of this method, so all these
     * tags can be removed if there is a need to
     * "unhilight" the string.
     *
     * Empty and non-scalar elements of $aWords are ignored:
     * an empty word would add an empty branch to the
     * alternation and that branch matches at every word boundary.
     *
     * Right after this call the hilighted strings are NOT
     * in the node tree - they sit inside CDATA sections.
     * Call $this->reload() if it's desired to turn all the
     * CDATA sections into the actual nodes in the document DOM tree
     *
     * @param array $aWords array of words that should
     * be hilighted in this html document
     *
     * @return object $this
     */
    public function hilightWords(array $aWords) {
        $this->aWords = $aWords;

        /**
         * NOTE(review): preg_quote() escapes for PCRE while the
         * pattern is later run by mb_eregi(); kept as in the
         * original - confirm the escaping is adequate for mbstring
         */
        $aQuoted = array();
        foreach ($aWords as $word) {
            if (!is_scalar($word)) {
                continue;
            }

            $word = (string) $word;
            if ('' !== $word) {
                $aQuoted[] = \preg_quote($word);
            }
        }

        if (!empty($aQuoted)) {
            $this->pattern = implode('|', $aQuoted);
            $this->hilight();
        }

        return $this;
    }

    /**
     * Internal method, which
     * is also called recursively if needed
     * It wraps the text of text nodes into
     * the special 'hilighting' tag (usually 'em')
     * It skips text inside the elements that should
     * be excluded (tags listed in $aExcluded plus 'i' and 'em' tags)
     *
     * The resulting markup is stored in a CDATA section that
     * replaces the text node and $this->bCDATA is set to true
     *
     * NOTE(review): the decoded text of the node goes into that
     * CDATA section unescaped; if the CDATA is later treated as
     * html then previously escaped markup in the text becomes
     * live markup - verify against the parent class.
     *
     * @param \DOMNode $o node to start from; defaults to
     * the 'body' element. Does nothing if there is no 'body'
     */
    protected function hilight(\DOMNode $o = null) {
        if (null === $o) {
            $o = $this->getElementsByTagName('body')->item(0);
            if (null === $o) {
                // document has no body element, nothing to hilight
                return;
            }
        }

        /**
         * Without a pattern the regex would be \b()\b which
         * matches an empty string at every word boundary
         */
        if (null === $this->pattern || '' === $this->pattern) {
            return;
        }

        d("\nNode name: " . $o->nodeName . ' type: ' . $o->nodeType . ' value: ' . $o->nodeValue);

        $nodeName = strtolower($o->nodeName);

        /**
         * Skip excluded nodes like the "a" node (link)
         * and skip tags that are already <em> or <i> tags
         */
        if (in_array($nodeName, $this->aExcluded, true) || in_array($nodeName, array('em', 'i'), true)) {
            return;
        }

        if (XML_TEXT_NODE == $o->nodeType) {
            d('passing node to hiInNode');

            /**
             * mb_eregi works in the encoding set by mb_regex_encoding()
             * NOTE(review): presumably set to UTF-8 elsewhere - confirm
             */
            $regex = '\b(' . $this->pattern . ')\b';
            if (\mb_eregi($regex, $o->nodeValue)) {
                $replacement = '<' . $this->hlTag . ' class="' . $this->hlClass . '">' . '\\1' . '</' . $this->hlTag . '>';
                $ret = \mb_eregi_replace($regex, $replacement, $o->nodeValue);

                /**
                 * Now replace the Node $o with the
                 * new CDATA node (CDATA node will simulate innerHTML)
                 */
                $CDATA = $o->ownerDocument->createCDATASection($ret);
                $o->parentNode->replaceChild($CDATA, $o);
                $this->bCDATA = true;
            }

            // text node has no child nodes
            return;
        }

        /**
         * If node has children then recursively
         * call this method for all child nodes
         * Children are collected first because
         * the recursion replaces nodes in this very list
         */
        $aChildren = array();
        if ($o->childNodes) {
            foreach ($o->childNodes as $child) {
                $aChildren[] = $child;
            }
        }

        foreach ($aChildren as $child) {
            $this->hilight($child);
        }
    }

    /**
     * Remove tags that were previously
     * added by the hilightWords() function
     * of this class
     * Every element with the hilight tag name and
     * hilight class is replaced with a plain
     * text node holding the text of that element
     *
     * It changes the underlying html of this object
     * changed html not returned by this method
     *
     * Only real element nodes can be found here: markup that
     * is still inside a CDATA section is not matched by the query
     *
     * @return object $this
     */
    public function unhilight() {
        $xp = new \DOMXPath($this);
        $query = '//' . $this->hlTag . '[@class=\'' . $this->hlClass . '\']';
        $Nodes = $xp->query($query);

        /**
         * query() returns false when the expression could
         * not be evaluated, must check before reading length
         */
        if (false === $Nodes || null === $Nodes) {
            return $this;
        }

        d('found hilighted: ' . $Nodes->length);

        foreach ($Nodes as $node) {
            if (null === $node->parentNode) {
                continue;
            }

            $textNode = $this->createTextNode($node->nodeValue);
            $node->parentNode->replaceChild($textNode, $node);
        }

        return $this;
    }

    /**
     *
     * Parse text nodes and replace text that looks like
     * url with the link to that url, also shortening
     * the anchor text
     * This method itself does not add the rel attribute to
     * the links it creates
     *
     * It will skip the nodes that should be excluded,
     * most importantly will skip the 'a' nodes - so it will
     * not attempt to linkify the text that is already a text of
     * the link. It also skips some elements that can't possibly have
     * interesting text value like 'img', 'br', 'hr', 'script', 'object' nodes
     *
     * A text node in which at least one url was found is replaced
     * with a CDATA section holding the new markup and
     * $this->bCDATA is set to true
     *
     * This method may recursively call itself if node $o
     * has child node (usually it does)
     *
     * @param \DOMNode $o node to start from; defaults to
     * the 'body' element. Does nothing if there is no 'body'
     *
     * @return object $this
     */
    public function linkify(\DOMNode $o = null) {

        if (null === $o) {
            $o = $this->getElementsByTagName('body')->item(0);
            if (null === $o) {
                // document has no body element, nothing to linkify
                return $this;
            }
        }

        /**
         * Skip excluded nodes, most importantly
         * nodes that are already the "a" node (link)
         */
        if (in_array(strtolower($o->nodeName), $this->aExcluded, true)) {
            return $this;
        }

        if (XML_TEXT_NODE == $o->nodeType) {
            $nodeValue = $o->nodeValue;
            $text = $this->linkifyText($nodeValue);

            /**
             * Now replace the Node $o with the
             * new CDATA node (CDATA node will simulate innerHTML)
             * BUT ONLY if any replacements have actually been done
             *
             * NOTE(review): the text around the links goes into the
             * CDATA section unescaped, same as in hilight() - verify
             * how the parent class serializes CDATA sections
             */
            if (null !== $text) {
                d('replaced something in $nodeValue: ' . $nodeValue);
                $CDATA = $o->ownerDocument->createCDATASection($text);
                $o->parentNode->replaceChild($CDATA, $o);
                $this->bCDATA = true;
            }

            // text node has no child nodes
            return $this;
        }

        /**
         * If node has children then recursively
         * call this method for all child nodes
         * Children are collected first because
         * the recursion replaces nodes in this very list
         */
        $aChildren = array();
        if ($o->childNodes) {
            foreach ($o->childNodes as $child) {
                $aChildren[] = $child;
            }
        }

        foreach ($aChildren as $child) {
            $this->linkify($child);
        }

        return $this;
    }

    /**
     * Turn everything that looks like url in plain text
     * into html link
     *
     * First pass handles urls that start with a scheme
     * (http://, https://, ftp://, ftps://), second pass handles
     * bare hosts that start with 'www.' or 'ftp.', these
     * get the 'http://' prefix in href
     *
     * This uses preg_replace_callback() with closures because the /e
     * modifier is gone as of php 7.0 and create_function()
     * is gone as of php 8.0
     * The patterns themselves are NOT unicode and cannot be
     * confused with a part of any multibyte char
     *
     * @param string $text value of text node
     *
     * @return string|null html with links or null if nothing
     * was replaced (or if regex engine returned error)
     */
    protected function linkifyText($text) {
        /**
         * Turn long text of the url (50 chars or longer) into
         * shorter text, using 32 chars from beginning + 15 chars
         * from end of string with ... in between
         *
         * The result is the anchor text of link
         * will never be longer than 50 chars
         * Encoding is passed explicitly because value
         * of DOM node is always in UTF-8
         * Length of the tail is passed explicitly because
         * older versions of php treat null length as 0
         */
        $shorten = function($s) {
            if (\mb_strlen($s, 'UTF-8') < 50) {
                return $s;
            }

            return \mb_substr($s, 0, 32, 'UTF-8') . "..." . \mb_substr($s, -15, 15, 'UTF-8');
        };

        $withScheme = function(array $m) use ($shorten) {
            return $m[1] . $m[2] . '<a href="' . $m[3] . '">' . $shorten($m[3]) . '</a>';
        };

        $withoutScheme = function(array $m) use ($shorten) {
            return $m[1] . $m[2] . '<a href="http://' . $m[3] . '">' . $shorten($m[3]) . '</a>';
        };

        $count = 0;
        $count2 = 0;

        $ret = \preg_replace_callback("/(^|[\n ])([\w]*?)((ht|f)tp(s)?:\/\/[\w]+[^ \,\"\n\r\t<]*)/is", $withScheme, $text, -1, $count);
        if (null === $ret) {
            // regex engine error, leave the node alone
            return null;
        }

        $ret2 = \preg_replace_callback("/(^|[\n ])([\w]*?)((www|ftp)\.[^ \,\"\t\n\r<]*)/is", $withoutScheme, $ret, -1, $count2);
        if (null === $ret2) {
            // second pass failed, keep result of the first pass
            $count2 = 0;
        } else {
            $ret = $ret2;
        }

        return (($count + $count2) > 0) ? $ret : null;
    }

    /**
     * Go over ALL links ('a' nodes that have the href attribute)
     * in this document and set target="_blank" on them
     * This is useful when parsing the external feed
     *
     * NOTE(review): despite the name of this method
     * the rel="nofollow" attribute is NOT set - that line
     * was commented out in this file and is left disabled here,
     * so with $setTargetBlank = false this method changes nothing.
     * Confirm whether disabling nofollow was intentional.
     *
     * @param bool $setTargetBlank if set to true (default)
     * ensures that all links have target="_blank"
     * attribute
     *
     * @return object $this
     */
    public function setNofollow($setTargetBlank = true) {

        $Links = $this->getElementsByTagName('a');
        if (null === $Links) {
            return $this;
        }

        foreach ($Links as $link) {
            if (!$link->hasAttribute('href')) {
                continue;
            }

            //$link->setAttribute('rel', 'nofollow');
            if ($setTargetBlank) {
                $link->setAttribute('target', '_blank');
            }
        }

        return $this;
    }

    /**
     * Add attributes rel="code" class="c"
     * to all 'code' tags that do not already
     * have the rel attribute
     * (existing class attribute of such tag is overwritten)
     * this way the html with code tags can be
     * parsed by syntax hilighter
     *
     * @return object $this
     */
    public function parseCodeTags() {
        $aCode = $this->getElementsByTagName('code');
        if (0 === $aCode->length) {
            d('no code elements');

            return $this;
        }

        foreach ($aCode as $node) {
            if (!$node->hasAttribute('rel')) {
                $node->setAttribute('rel', 'code');
                $node->setAttribute('class', 'c');
            }
        }

        return $this;
    }

    /**
     * Truncate this html string so that total text
     * will be cut to under $maxLen
     * The plan is to remove all DOMElements from this object
     * that are causing the string to exceed the maxLen length
     * and also to cut the last text node if necessary, adding
     * the ...
     *
     * @todo unfinished - not implemented yet, the document
     * is never changed by this method
     *
     * @param int $maxLen
     * @return bool true if html string was truncated,
     * currently always false
     */
    public function truncate($maxLen) {
        // nothing is truncated until this is implemented
        return false;
    }

}
